The following code is from my github page https://github.com/SCK22/HelperFunctions/blob/master/Plotting/HelperFunctionsPlotlyPlotting.py
Importing libraries, defining new functions and reusing some of the functions I wrote for performing quick analysis
"""Helper Functions for Plotting"""
import numpy as np
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected = True)
def generate_layout_bar(col_name):
    """
    Generate a layout object for bar chart
    Arguments:
        col_name: column name interpolated into the figure title
    Output:
        plotly Layout object (800x600 px, 'Courier New' fonts, y axis titled 'Percentage')
    """
    font_family = 'Courier New, monospace'
    layout_bar = go.Layout(
        autosize=False,  # auto size the graph? use False if you are specifying the height and width
        width=800,  # width of the figure in pixels
        height=600,  # height of the figure in pixels
        # The title font is nested under `title`; the flat `titlefont`
        # attribute is deprecated in plotly.
        title=dict(
            text="Distribution of {} column".format(col_name),  # title of the figure
            font=dict(
                family=font_family,  # font family
                size=14,  # size of the font
                color='black',  # color of the font
            ),
        ),
        # granular control on the axes objects
        xaxis=dict(
            tickfont=dict(
                family=font_family,  # font family
                size=14,  # size of ticks displayed on the x axis
                color='black',  # color of the font
            ),
        ),
        yaxis=dict(
            # range=[0,100],
            title=dict(
                text='Percentage',
                font=dict(
                    size=14,
                    color='black',
                ),
            ),
            tickfont=dict(
                family=font_family,  # font family
                size=14,  # size of ticks displayed on the y axis
                color='black',  # color of the font
            ),
        ),
        font=dict(
            family=font_family,  # font family
            color="white",  # color of the font
            size=12,  # size of the font displayed on the bar
        ),
    )
    return layout_bar
def plot_count_bar(dataframe_name, col_name, top_n=None):
    """
    Plot a bar chart for the categorical columns
    Arguments:
        dataframe name
        categorical column name
        top_n: optional; keep only the top_n most frequent levels. The
            percentages are then computed relative to those levels only.
    Output:
        Plot
    """
    # create a table with value counts
    counts = dataframe_name[col_name].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)

    # Compute the percentages once and reuse them for both the bar heights and
    # the labels. Scaling to percent *before* rounding keeps float noise such
    # as 32.019999999999996 out of the text drawn on the bars.
    percentages = np.round(counts.values.astype(float) * 100 / counts.values.sum(), 2)

    # creating a Bar chart object of plotly
    data = [go.Bar(
        x=counts.index.astype(str),  # x axis values
        y=percentages,  # y axis values
        # text to be displayed on the bar, we are doing this to display the
        # '%' symbol along with the number on the bar
        text=['{}%'.format(value) for value in percentages],
        textposition='auto',  # specify at which position on the bar the text should appear
        marker=dict(color='#0047AB'),  # change color of the bar, color used here Cobalt Blue
    )]
    layout_bar = generate_layout_bar(col_name=col_name)
    fig = go.Figure(data=data, layout=layout_bar)
    return iplot(fig)
def plot_bar(dataframe_name, cat_col_name, num_col_name, top_n = 20):
    """
    Plot a bar chart of a numeric column against a categorical column
    Arguments:
        dataframe name
        categorical column name (x axis)
        numeric column name (y axis, also used to rank the rows)
        top_n: number of highest-valued rows to keep
    Output:
        Plot
    """
    # rank the rows by the numeric column and keep the largest top_n
    top_rows = dataframe_name.sort_values(by=num_col_name, ascending=False).head(top_n)
    categories = top_rows[cat_col_name]
    values = top_rows[num_col_name]

    # label drawn on each bar: the value rounded to 2 decimals plus a '%' sign
    labels = ['{}%'.format(np.round(value, 2)) for value in values]

    bar = go.Bar(
        x=categories,
        y=values,
        text=labels,
        textposition='auto',  # let plotly choose where the label sits on the bar
        marker=dict(color='#0047AB'),  # Cobalt Blue
    )
    figure = go.Figure(data=[bar], layout=generate_layout_bar(col_name=cat_col_name))
    return iplot(figure)
def plot_hist(dataframe, col_name):
    """
    Plot a histogram of one column
    Arguments:
        dataframe
        col_name: column whose values are binned
    Output:
        Plot
    """
    # Lava (#CC0E1D); plotly accepts colors in HEX or rgb format
    histogram = go.Histogram(x=dataframe[col_name], marker={"color": '#CC0E1D'})
    figure = go.Figure(
        data=[histogram],
        layout=go.Layout(title="Histogram of {}".format(col_name)),
    )
    return iplot(figure)
def plot_multi_box(dataframe, col_name, num_col_name):
    """
    Plot one box plot per level of a column
    Arguments:
        dataframe
        col_name: column whose unique values define the boxes
        num_col_name: numeric column summarised by each box
    Output:
        Plot
    """
    levels = dataframe[col_name].unique()
    # one trace per level, built from the rows that carry that level
    traces = [
        go.Box(y=dataframe[num_col_name][dataframe[col_name] == level], name=level)
        for level in levels
    ]
    title = "Boxplot of levels in {} for {} column".format(col_name, num_col_name)
    figure = go.Figure(data=traces, layout=go.Layout(title=title))
    return iplot(figure)
import sys
import os
import pandas as pd
from datetime import datetime
data_path = "data/"


def load_data(file_name):
    """
    Load a CSV file from the data directory
    Arguments:
        file_name: name of the file inside data_path, without the '.csv' extension
    Output:
        pandas DataFrame returned by pd.read_csv
    """
    # os.path.join keeps the path valid whether or not data_path ends with a separator
    return pd.read_csv(os.path.join(data_path, f"{file_name}.csv"))
def convert_to_pandas_datetime(col):
    """
    Convert a column to pandas datetime
    Arguments:
        col: value(s) handed to pd.to_datetime
    Output:
        whatever pd.to_datetime returns for col
    """
    parsed = pd.to_datetime(col)
    return parsed
List all the files in the data path
os.listdir(data_path)
# read the four CSV extracts, in the same order as they are named below
devices, notifications, transactions, users = map(
    load_data,
    ("rev-devices", "rev-notifications", "rev-transactions", "rev-users"),
)
def dataset_info(df):
    """
    Print the head of a dataframe and report its size
    Arguments:
        df: dataframe to inspect
    Output:
        dict with the keys "n_rows" and "n_cols"
    """
    shape = df.shape
    summary = {"n_rows": shape[0], "n_cols": shape[1]}
    print("First few rows")
    print(df.head())
    return summary
dataset_info(devices)
dataset_info(notifications)
dataset_info(transactions)
dataset_info(users)
users.user_id.nunique()
Before we move ahead, let me convert the date to pandas date format for easier manipulation
# parse the created_date column of each table in place
users["created_date"] = convert_to_pandas_datetime(users["created_date"])
transactions["created_date"] = convert_to_pandas_datetime(transactions["created_date"])
notifications["created_date"] = convert_to_pandas_datetime(notifications["created_date"])

# time between the earliest and the latest record of each table
registration_span = users["created_date"].max() - users["created_date"].min()
transaction_span = transactions["created_date"].max() - transactions["created_date"].min()
notification_span = notifications["created_date"].max() - notifications["created_date"].min()

print("We have user registration data for the {} days.".format(registration_span.days))
print("We have user transactions for the {} days.".format(transaction_span.days))
print("We have user notifications for the {} days.".format(notification_span.days))
We can infer the age of the user from users birth year
users["age"] = 2020 - users.birth_year
What is the age group of the users on the platform
plot_hist(users,"age")
From the histogram, the bulk of the users are in the 25-35 age group
temp = pd.DataFrame(users.plan.value_counts())
plot_count_bar(users,"plan")
Over 92% of the users are on the standard plan and the bulk of the users are using STANDARD or PREMIUM plans
users.columns
plot_count_bar(users, "country")
32% of the user base is from Great Britain
plot_count_bar(users, "city", top_n=10)
I have plotted the top 10 cities here, among the top 10 cities, London has ~42 % of the users and Warszawa comes next with ~10 % users
# Flag every row whose created_date value occurs more than once. The mask is
# aligned row-for-row with `users`; a mask derived from value_counts() has one
# entry per distinct value, so it only lines up with `users` when every value
# is unique.
shared_created_date = users.created_date.duplicated(keep=False)
shared_created_date.any()
users.created_date[shared_created_date]
No 2 users signed up on the same day, since this is munged data, this might be the case
plot_count_bar(users,"attributes_notifications_marketing_push" )
Only ~5% of the users did not opt for notifications
plot_count_bar(users,"attributes_notifications_marketing_email" )
~ 90 % of the users signed up for marketing emails
plot_count_bar(users,"user_settings_crypto_unlocked")
only 19% of the users unlocked crypto
plot_count_bar(notifications, "channel")
Among the 3 notification channels, SMS is the least used mode
notifications.columns
plot_count_bar(notifications, "status")
~ 27 % of the notifications failed to be delivered
# number of notifications per (channel, status) pair, as a flat table
temp = notifications.groupby(["channel", "status"]).size().reset_index(name="counts")
temp
# every share below is expressed relative to ALL notifications
total_notifications = temp.counts.sum()
failed_rows = temp[temp.status == "FAILED"]
sent_rows = temp[temp.status == "SENT"]
failed_x = failed_rows.channel
failed_y = (failed_rows.counts / total_notifications) * 100
sent_x = sent_rows.channel
sent_y = (sent_rows.counts / total_notifications) * 100
traces = [
    go.Bar(name='FAILED', x=failed_x, y=failed_y),
    go.Bar(name='SENT', x=sent_x, y=sent_y),
]
fig = go.Figure(data=traces)
# draw the FAILED and SENT bars side by side for each channel
fig.update_layout(
    barmode='group',
    title="Percentage of notifications sent and failed (% calculated out of total)",
)
fig.show()
Most of the failed notifications were in the email mode
plot_count_bar(notifications, "reason")
Most notifications were sent for REENGAGEMENT_ACTIVE_FUNDS (almost 30%)
plot_count_bar(devices,"brand")
The user base is equally spread among Apple and Android
transactions.columns
plot_count_bar(transactions, "transactions_type")
Customers are mostly using the platform for card payment and transfer.
plot_count_bar(transactions, "transactions_currency")
Most customers are dealing with Euros
transactions.columns
transactions.transactions_state.value_counts()
plot_count_bar(transactions,"transactions_state")
There is only a 1.5% probability that a transaction will fail
-This shows that the platform is very stable
transactions.created_date.value_counts()
The maximum number of transactions on a single day is just 4; let us roll up the days into weekdays, weeks, months, etc. and check
datetime.strptime('January 11, 2010', '%B %d, %Y').strftime('%a')
# Format the whole column at once through the .dt accessor instead of calling
# strftime row by row in Python (created_date was converted to datetime above).
transactions["dayofweek"] = transactions.created_date.dt.strftime('%a')
plot_count_bar(transactions,"dayofweek")
All the days of the week seem to have similar transaction volume
- Not sure if this is due to masked data
# Zero-padded month number ('01'..'12'), formatted for the whole column at once
# through the .dt accessor instead of a row-by-row Python apply.
transactions["transactionmonth"] = transactions.created_date.dt.strftime('%m')
plot_count_bar(transactions, "transactionmonth")
March and April have the most transactions; there was a dip in the number of transactions in the month of July, followed by a steady rise again
transactions.columns
# plot_multi_box(transactions,"transactions_type", "amount_usd")
There are outliers in the transfer group: more than 10 billion USD were transferred! This makes the plot out of scale and does not help with understanding the other modes of engagement, so I will filter out the transactions above 10,000 USD and continue the analysis for exploratory purposes
Note : Plotting the transactions is slowing down my system, so I will take a sizable sample of the transactions and continue the analysis
# A fixed seed makes the 10% sample, and the findings read off it, reproducible.
transactions_sample = transactions.sample(frac = 0.1, random_state = 42)
plot_multi_box(transactions_sample,"transactions_type", "amount_usd")
# Aggregations are named by string, so the result column labels do not depend
# on the numpy functions' __name__ attributes.
transactions_sample.groupby("transactions_type").agg({"amount_usd" : ["median", "mean", "min", "max", "std"]}).reset_index()
Findings
transactions_filtered = transactions[transactions.amount_usd <= 1e4]
transactions.shape[0] - transactions_filtered.shape[0]
((transactions.shape[0] - transactions_filtered.shape[0])/transactions.shape[0])*100
There are 1133 High value transactions in the dataset, above 10,000 USD.
These values might be important if we are doing a fraud analysis with the dataset
# Aggregations are named by string, so the result column labels do not depend
# on the numpy functions' __name__ attributes.
transactions_filtered.groupby("transactions_type").agg({"amount_usd" : ["median", "mean", "min", "max", "std"]}).reset_index()
Findings
# A fixed seed keeps the plotted 30% sample identical between runs.
plot_multi_box(transactions_filtered.sample(frac=0.3, random_state=42),"transactions_type", "amount_usd")
The ATM transactions are under 2000 USD, while card payment, transfer, exchange and topup seem to go up to 10,000 USD
num_hv_transactions = transactions.shape[0] - transactions_filtered.shape[0]
def card_holder_presence(x):
    """
    Map a raw cardholder-presence value to a boolean
    Arguments:
        x: raw value; the string "TRUE" (any case, surrounding whitespace
           ignored) or an actual boolean
    Output:
        True when x denotes presence, False for everything else
        (including None, NaN and any other string)
    """
    # Booleans pass through unchanged so the function is idempotent:
    # re-applying it to an already converted column must not turn every
    # True into False.
    if isinstance(x, (bool, np.bool_)):
        return bool(x)
    if isinstance(x, str):
        return x.strip().upper() == "TRUE"
    return False
# normalise the raw flag to booleans (True = cardholder present)
transactions.ea_cardholderpresence = transactions.ea_cardholderpresence.apply(card_holder_presence)
# Presence flag of the high value (> 10,000 USD) transactions.
hv_present = transactions.ea_cardholderpresence[transactions.amount_usd > 1e4].astype(bool)
# Count the transactions where the holder was NOT present: summing the flag
# itself counts the ones where the holder WAS present. Every value that is not
# "TRUE" (missing values included) counts as not present.
num_time_user_not_present_hv = np.sum(~hv_present)
num_time_user_not_present_hv
num_hv_transactions
(num_time_user_not_present_hv/num_hv_transactions) *100
%.3% of the time when high value transactions took place, the user was not present
# Presence flag of the low value (<= 10,000 USD) transactions.
lv_present = transactions.ea_cardholderpresence[transactions.amount_usd <= 1e4].astype(bool)
# Count the transactions where the holder was NOT present: summing the flag
# itself counts the ones where the holder WAS present.
num_time_user_not_present_lv = np.sum(~lv_present)
num_time_user_not_present_lv
num_time_user_not_present_lv/lv_present.shape[0]
The user was not present in 10% of the low value transactions
# Write the processed tables next to the raw extracts. The directory comes
# from data_path, the same setting load_data reads from, instead of a second
# hard-coded "data/" prefix.
transactions.to_csv(os.path.join(data_path, "transactions_processed.csv"), index = False)
users.to_csv(os.path.join(data_path, "users_processed.csv"), index = False)
notifications.to_csv(os.path.join(data_path, "notifications_processed.csv"), index= False)